{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Path to the JSON file\n",
    "json_path = \"/home/ubuntu/aco/KernelBench/results/timing/H100_PCIe_LambdaLabs/baseline_time_torch.json\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the JSON file and parse it into `data`.\n",
    "# JSON text is UTF-8, so decode it explicitly instead of relying on the\n",
    "# platform's locale-dependent default encoding.\n",
    "with open(json_path, \"r\", encoding=\"utf-8\") as f:\n",
    "    data = json.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'level1': {'1_Square_matrix_multiplication_.py': {'mean': 3.75, 'std': 0.0347, 'min': 3.67, 'max': 3.83, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '2_Standard_matrix_multiplication_.py': {'mean': 4.56, 'std': 0.101, 'min': 4.2, 'max': 4.61, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '3_Batched_matrix_multiplication.py': {'mean': 67.7, 'std': 1.58, 'min': 64.0, 'max': 70.4, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '4_Matrix_vector_multiplication_.py': {'mean': 4.5, 'std': 0.00216, 'min': 4.5, 'max': 4.51, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '5_Matrix_scalar_multiplication.py': {'mean': 4.64, 'std': 0.00232, 'min': 4.63, 'max': 4.64, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '6_Matmul_with_large_K_dimension_.py': {'mean': 1.83, 'std': 0.0575, 'min': 1.71, 'max': 1.87, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '7_Matmul_with_small_K_dimension_.py': {'mean': 6.31, 'std': 0.216, 'min': 5.49, 'max': 6.91, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '8_Matmul_with_irregular_shapes_.py': {'mean': 7.98, 'std': 0.159, 'min': 7.42, 'max': 8.68, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '9_Tall_skinny_matrix_multiplication_.py': {'mean': 4.35, 'std': 0.157, 'min': 3.68, 'max': 4.41, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '10_3D_tensor_matrix_multiplication.py': {'mean': 1.53, 'std': 0.00152, 'min': 1.53, 'max': 1.54, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '11_4D_tensor_matrix_multiplication.py': {'mean': 26.4, 'std': 0.994, 'min': 25.1, 'max': 27.7, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '12_Matmul_with_diagonal_matrices_.py': {'mean': 3.68, 'std': 0.0138, 'min': 3.66, 'max': 3.7, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 
'device': 'cuda:0'}, '13_Matmul_for_symmetric_matrices.py': {'mean': 3.78, 'std': 0.0484, 'min': 3.64, 'max': 3.8, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '14_Matmul_for_upper_triangular_matrices.py': {'mean': 3.7, 'std': 0.016, 'min': 3.69, 'max': 3.74, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '15_Matmul_for_lower_triangular_matrices.py': {'mean': 3.7, 'std': 0.00233, 'min': 3.69, 'max': 3.71, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '16_Matmul_with_transposed_A.py': {'mean': 3.5, 'std': 0.034, 'min': 3.41, 'max': 3.52, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '17_Matmul_with_transposed_B.py': {'mean': 5.0, 'std': 0.11, 'min': 4.56, 'max': 5.07, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '18_Matmul_with_transposed_both.py': {'mean': 3.62, 'std': 0.031, 'min': 3.54, 'max': 3.65, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '19_ReLU.py': {'mean': 6.95, 'std': 0.00262, 'min': 6.95, 'max': 6.96, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '20_LeakyReLU.py': {'mean': 6.95, 'std': 0.00342, 'min': 6.95, 'max': 6.98, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '21_Sigmoid.py': {'mean': 6.95, 'std': 0.00245, 'min': 6.94, 'max': 6.95, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '22_Tanh.py': {'mean': 6.95, 'std': 0.00262, 'min': 6.94, 'max': 6.96, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '23_Softmax.py': {'mean': 14.3, 'std': 0.0301, 'min': 14.2, 'max': 14.4, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '24_LogSoftmax.py': {'mean': 13.8, 'std': 0.0102, 'min': 13.8, 'max': 13.8, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '25_Swish.py': {'mean': 17.1, 'std': 0.00447, 'min': 17.1, 'max': 17.1, 'num_trials': 100, 'hardware': 'NVIDIA 
H100 PCIe', 'device': 'cuda:0'}, '26_GELU_.py': {'mean': 6.95, 'std': 0.003, 'min': 6.94, 'max': 6.96, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '27_SELU_.py': {'mean': 6.95, 'std': 0.00191, 'min': 6.95, 'max': 6.96, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '28_HardSigmoid.py': {'mean': 6.95, 'std': 0.00296, 'min': 6.95, 'max': 6.96, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '29_Softplus.py': {'mean': 6.97, 'std': 0.00178, 'min': 6.96, 'max': 6.97, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '30_Softsign.py': {'mean': 24.1, 'std': 0.00542, 'min': 24.1, 'max': 24.1, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '31_ELU.py': {'mean': 6.95, 'std': 0.00183, 'min': 6.95, 'max': 6.96, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '32_HardTanh.py': {'mean': 6.95, 'std': 0.00331, 'min': 6.95, 'max': 6.97, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '33_BatchNorm.py': {'mean': 19.6, 'std': 0.0665, 'min': 19.5, 'max': 19.7, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '34_InstanceNorm.py': {'mean': 13.9, 'std': 0.00459, 'min': 13.9, 'max': 13.9, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '35_GroupNorm_.py': {'mean': 13.8, 'std': 0.139, 'min': 13.7, 'max': 14.0, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '36_RMSNorm_.py': {'mean': 21.7, 'std': 0.0731, 'min': 21.6, 'max': 21.9, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '37_FrobeniusNorm_.py': {'mean': 12.8, 'std': 0.0115, 'min': 12.8, 'max': 12.9, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '38_L1Norm_.py': {'mean': 24.0, 'std': 0.00625, 'min': 24.0, 'max': 24.0, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '39_L2Norm_.py': {'mean': 14.8, 'std': 0.0416, 'min': 
14.7, 'max': 14.8, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '40_LayerNorm.py': {'mean': 8.94, 'std': 0.0056, 'min': 8.92, 'max': 8.96, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '41_Max_Pooling_1D.py': {'mean': 27.9, 'std': 0.104, 'min': 27.9, 'max': 28.1, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '42_Max_Pooling_2D.py': {'mean': 28.0, 'std': 0.00661, 'min': 28.0, 'max': 28.1, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '43_Max_Pooling_3D.py': {'mean': 5.27, 'std': 0.044, 'min': 5.12, 'max': 5.34, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '44_Average_Pooling_1D.py': {'mean': 21.2, 'std': 0.00363, 'min': 21.2, 'max': 21.2, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '45_Average_Pooling_2D.py': {'mean': 9.76, 'std': 0.0233, 'min': 9.7, 'max': 9.81, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '46_Average_Pooling_3D.py': {'mean': 11.4, 'std': 0.145, 'min': 11.3, 'max': 11.7, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '47_Sum_reduction_over_a_dimension.py': {'mean': 4.6, 'std': 0.0318, 'min': 4.53, 'max': 4.69, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '48_Mean_reduction_over_a_dimension.py': {'mean': 4.6, 'std': 0.0345, 'min': 4.53, 'max': 4.73, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '49_Max_reduction_over_a_dimension.py': {'mean': 4.76, 'std': 0.0121, 'min': 4.72, 'max': 4.78, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '50_conv_standard_2D__square_input__square_kernel.py': {'mean': 1.2, 'std': 0.00481, 'min': 1.19, 'max': 1.21, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '51_Argmax_over_a_dimension.py': {'mean': 4.75, 'std': 0.0136, 'min': 4.7, 'max': 4.77, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 
'cuda:0'}, '52_Argmin_over_a_dimension.py': {'mean': 4.75, 'std': 0.0155, 'min': 4.7, 'max': 4.77, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '53_Min_reduction_over_a_dimension.py': {'mean': 4.76, 'std': 0.0122, 'min': 4.72, 'max': 4.78, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '54_conv_standard_3D__square_input__square_kernel.py': {'mean': 1.79, 'std': 0.00298, 'min': 1.79, 'max': 1.8, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '55_conv_standard_2D__asymmetric_input__square_kernel.py': {'mean': 6.44, 'std': 0.184, 'min': 6.02, 'max': 6.83, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '56_conv_standard_2D__asymmetric_input__asymmetric_kernel.py': {'mean': 5.24, 'std': 0.116, 'min': 4.7, 'max': 5.29, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '57_conv_transposed_2D__square_input__square_kernel.py': {'mean': 9.2, 'std': 0.203, 'min': 9.01, 'max': 9.75, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.py': {'mean': 2.62, 'std': 0.0211, 'min': 2.58, 'max': 2.69, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '59_conv_standard_3D__asymmetric_input__square_kernel.py': {'mean': 2.74, 'std': 0.0288, 'min': 2.68, 'max': 2.79, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '60_conv_standard_3D__square_input__asymmetric_kernel.py': {'mean': 7.17, 'std': 0.244, 'min': 6.94, 'max': 7.51, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '61_conv_transposed_3D__square_input__square_kernel.py': {'mean': 6.44, 'std': 0.0194, 'min': 6.38, 'max': 6.48, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '62_conv_standard_2D__square_input__asymmetric_kernel.py': {'mean': 5.58, 'std': 0.123, 'min': 4.98, 'max': 5.64, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 
'device': 'cuda:0'}, '63_conv_standard_2D__square_input__square_kernel.py': {'mean': 10.1, 'std': 0.36, 'min': 9.69, 'max': 10.8, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '64_conv_transposed_1D.py': {'mean': 8.05, 'std': 0.0243, 'min': 7.99, 'max': 8.14, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '65_conv_transposed_2D__square_input__asymmetric_kernel.py': {'mean': 3.58, 'std': 0.0705, 'min': 3.39, 'max': 3.67, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '66_conv_standard_3D__asymmetric_input__asymmetric_kernel.py': {'mean': 3.46, 'std': 0.0267, 'min': 3.45, 'max': 3.59, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '67_conv_standard_1D.py': {'mean': 3.99, 'std': 0.0771, 'min': 3.76, 'max': 4.06, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '68_conv_transposed_3D__square_input__asymmetric_kernel.py': {'mean': 12.1, 'std': 0.385, 'min': 11.4, 'max': 12.8, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.py': {'mean': 4.32, 'std': 0.0495, 'min': 4.19, 'max': 4.35, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '70_conv_transposed_3D__asymmetric_input__square_kernel.py': {'mean': 12.1, 'std': 0.296, 'min': 11.9, 'max': 12.6, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '71_conv_transposed_2D__asymmetric_input__square_kernel.py': {'mean': 2.22, 'std': 0.0253, 'min': 2.17, 'max': 2.25, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_.py': {'mean': 3.22, 'std': 0.00616, 'min': 3.21, 'max': 3.24, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.py': {'mean': 3.03, 'std': 0.0163, 'min': 3.0, 
'max': 3.08, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '74_conv_transposed_1D_dilated.py': {'mean': 2.9, 'std': 0.0111, 'min': 2.87, 'max': 2.92, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.py': {'mean': 7.29, 'std': 0.00628, 'min': 7.27, 'max': 7.31, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '76_conv_standard_1D_dilated_strided__.py': {'mean': 19.1, 'std': 0.185, 'min': 18.9, 'max': 19.5, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.py': {'mean': 2.43, 'std': 0.021, 'min': 2.38, 'max': 2.48, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.py': {'mean': 3.2, 'std': 0.056, 'min': 3.11, 'max': 3.28, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.py': {'mean': 2.7, 'std': 0.0159, 'min': 2.64, 'max': 2.73, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.py': {'mean': 5.52, 'std': 0.118, 'min': 4.92, 'max': 5.56, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.py': {'mean': 2.52, 'std': 0.0305, 'min': 2.48, 'max': 2.65, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '82_conv_depthwise_2D_square_input_square_kernel.py': {'mean': 3.36, 'std': 0.00189, 'min': 3.35, 'max': 3.37, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '83_conv_depthwise_2D_square_input_asymmetric_kernel.py': {'mean': 1.93, 'std': 0.0029, 'min': 1.93, 
'max': 1.95, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '84_conv_depthwise_2D_asymmetric_input_square_kernel.py': {'mean': 13.3, 'std': 0.00175, 'min': 13.3, 'max': 13.3, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.py': {'mean': 3.03, 'std': 0.00195, 'min': 3.03, 'max': 3.04, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '86_conv_depthwise_separable_2D.py': {'mean': 5.16, 'std': 0.053, 'min': 5.11, 'max': 5.26, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '87_conv_pointwise_2D.py': {'mean': 7.63, 'std': 0.00608, 'min': 7.62, 'max': 7.65, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '88_MinGPTNewGelu.py': {'mean': 2.64, 'std': 0.00204, 'min': 2.63, 'max': 2.64, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '89_cumsum.py': {'mean': 7.33, 'std': 0.00905, 'min': 7.31, 'max': 7.35, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '90_cumprod.py': {'mean': 7.33, 'std': 0.00783, 'min': 7.31, 'max': 7.35, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '91_cumsum_reverse.py': {'mean': 17.5, 'std': 0.0215, 'min': 17.3, 'max': 17.5, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '92_cumsum_exclusive.py': {'mean': 12.8, 'std': 0.0179, 'min': 12.7, 'max': 12.8, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '93_masked_cumsum.py': {'mean': 13.2, 'std': 0.00994, 'min': 13.2, 'max': 13.2, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '94_MSELoss.py': {'mean': 13.6, 'std': 0.00396, 'min': 13.6, 'max': 13.6, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '95_CrossEntropyLoss.py': {'mean': 1.67, 'std': 0.00297, 'min': 1.67, 'max': 1.68, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, 
'96_HuberLoss.py': {'mean': 9.0, 'std': 0.00435, 'min': 8.99, 'max': 9.02, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '97_ScaledDotProductAttention.py': {'mean': 0.243, 'std': 0.00352, 'min': 0.23, 'max': 0.253, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '98_KLDivLoss.py': {'mean': 6.29, 'std': 0.00291, 'min': 6.28, 'max': 6.29, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '99_TripletMarginLoss.py': {'mean': 6.85, 'std': 0.00288, 'min': 6.84, 'max': 6.85, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '100_HingeLoss.py': {'mean': 16.6, 'std': 0.00368, 'min': 16.6, 'max': 16.6, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}}, 'level2': {'1_Conv2D_ReLU_BiasAdd.py': {'mean': 6.61, 'std': 0.0175, 'min': 6.56, 'max': 6.65, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '2_ConvTranspose2d_BiasAdd_Clamp_Scaling_Clamp_Divide.py': {'mean': 19.9, 'std': 0.00608, 'min': 19.9, 'max': 19.9, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '3_ConvTranspose3d_Sum_LayerNorm_AvgPool_GELU.py': {'mean': 62.4, 'std': 0.0103, 'min': 62.4, 'max': 62.4, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '4_Conv2d_Mish_Mish.py': {'mean': 27.3, 'std': 0.612, 'min': 26.2, 'max': 28.5, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '5_ConvTranspose2d_Subtract_Tanh.py': {'mean': 55.8, 'std': 0.324, 'min': 55.3, 'max': 56.6, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '6_Conv3d_Softmax_MaxPool_MaxPool.py': {'mean': 1.46, 'std': 0.0133, 'min': 1.45, 'max': 1.51, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '7_Conv3d_ReLU_LeakyReLU_GELU_Sigmoid_BiasAdd.py': {'mean': 28.0, 'std': 0.304, 'min': 27.8, 'max': 28.7, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, 
'8_Conv3d_Divide_Max_GlobalAvgPool_BiasAdd_Sum.py': {'mean': 8.51, 'std': 0.00287, 'min': 8.51, 'max': 8.53, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '9_Matmul_Subtract_Multiply_ReLU.py': {'mean': 4.86, 'std': 0.0469, 'min': 4.75, 'max': 4.92, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '10_ConvTranspose2d_MaxPool_Hardtanh_Mean_Tanh.py': {'mean': 14.6, 'std': 0.294, 'min': 14.1, 'max': 15.2, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '11_ConvTranspose2d_BatchNorm_Tanh_MaxPool_GroupNorm.py': {'mean': 3.82, 'std': 0.0179, 'min': 3.77, 'max': 3.85, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '12_Gemm_Multiply_LeakyReLU.py': {'mean': 4.82, 'std': 0.0413, 'min': 4.71, 'max': 4.88, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '13_ConvTranspose3d_Mean_Add_Softmax_Tanh_Scaling.py': {'mean': 13.5, 'std': 0.32, 'min': 13.1, 'max': 14.1, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '14_Gemm_Divide_Sum_Scaling.py': {'mean': 5.05, 'std': 0.112, 'min': 4.58, 'max': 5.19, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '15_ConvTranspose3d_BatchNorm_Subtract.py': {'mean': 2.62, 'std': 0.0116, 'min': 2.6, 'max': 2.66, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '16_ConvTranspose2d_Mish_Add_Hardtanh_Scaling.py': {'mean': 17.3, 'std': 0.0291, 'min': 17.3, 'max': 17.4, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '17_Conv2d_InstanceNorm_Divide.py': {'mean': 7.45, 'std': 0.0187, 'min': 7.38, 'max': 7.51, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '18_Matmul_Sum_Max_AvgPool_LogSumExp_LogSumExp.py': {'mean': 4.85, 'std': 0.0459, 'min': 4.71, 'max': 5.02, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '19_ConvTranspose2d_GELU_GroupNorm.py': {'mean': 18.3, 'std': 0.277, 'min': 17.9, 'max': 
19.0, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '20_ConvTranspose3d_Sum_ResidualAdd_Multiply_ResidualAdd.py': {'mean': 5.61, 'std': 0.00467, 'min': 5.6, 'max': 5.62, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '21_Conv2d_Add_Scale_Sigmoid_GroupNorm.py': {'mean': 10.5, 'std': 0.152, 'min': 10.3, 'max': 10.8, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '22_Matmul_Scale_ResidualAdd_Clamp_LogSumExp_Mish.py': {'mean': 4.99, 'std': 0.0272, 'min': 4.91, 'max': 5.02, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '23_Conv3d_GroupNorm_Mean.py': {'mean': 2.23, 'std': 0.0028, 'min': 2.23, 'max': 2.25, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '24_Conv3d_Min_Softmax.py': {'mean': 1.76, 'std': 0.00249, 'min': 1.75, 'max': 1.77, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '25_Conv2d_Min_Tanh_Tanh.py': {'mean': 9.91, 'std': 0.327, 'min': 9.49, 'max': 10.6, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '26_ConvTranspose3d_Add_HardSwish.py': {'mean': 8.09, 'std': 0.00497, 'min': 8.08, 'max': 8.11, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '27_Conv3d_HardSwish_GroupNorm_Mean.py': {'mean': 1.37, 'std': 0.00326, 'min': 1.36, 'max': 1.38, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '28_BMM_InstanceNorm_Sum_ResidualAdd_Multiply.py': {'mean': 4.94, 'std': 0.0383, 'min': 4.84, 'max': 4.99, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '29_Matmul_Mish_Mish.py': {'mean': 4.85, 'std': 0.0594, 'min': 4.66, 'max': 4.9, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '30_Gemm_GroupNorm_Hardtanh.py': {'mean': 4.98, 'std': 0.0569, 'min': 4.8, 'max': 5.05, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '31_Conv2d_Min_Add_Multiply.py': {'mean': 7.76, 'std': 0.0327, 'min': 
7.71, 'max': 8.0, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '32_Conv2d_Scaling_Min.py': {'mean': 24.9, 'std': 0.614, 'min': 24.0, 'max': 26.2, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '33_Gemm_Scale_BatchNorm.py': {'mean': 4.92, 'std': 0.0514, 'min': 4.75, 'max': 4.96, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '34_ConvTranspose3d_LayerNorm_GELU_Scaling.py': {'mean': 62.9, 'std': 0.16, 'min': 62.2, 'max': 63.2, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '35_Conv2d_Subtract_HardSwish_MaxPool_Mish.py': {'mean': 7.97, 'std': 0.0793, 'min': 7.87, 'max': 8.3, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '36_ConvTranspose2d_Min_Sum_GELU_Add.py': {'mean': 2.22, 'std': 0.00415, 'min': 2.21, 'max': 2.23, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '37_Matmul_Swish_Sum_GroupNorm.py': {'mean': 12.2, 'std': 0.455, 'min': 11.3, 'max': 12.9, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '38_ConvTranspose3d_AvgPool_Clamp_Softmax_Multiply.py': {'mean': 1.16, 'std': 0.00329, 'min': 1.16, 'max': 1.17, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '39_Gemm_Scale_BatchNorm.py': {'mean': 17.1, 'std': 0.581, 'min': 15.8, 'max': 18.0, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '40_Matmul_Scaling_ResidualAdd.py': {'mean': 17.2, 'std': 0.592, 'min': 16.4, 'max': 18.3, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '41_Gemm_BatchNorm_GELU_ReLU.py': {'mean': 17.4, 'std': 0.601, 'min': 16.5, 'max': 18.3, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '42_ConvTranspose2d_GlobalAvgPool_BiasAdd_LogSumExp_Sum_Multiply.py': {'mean': 10.8, 'std': 0.112, 'min': 10.7, 'max': 11.1, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '43_Conv3d_Max_LogSumExp_ReLU.py': {'mean': 4.02, 
'std': 0.0753, 'min': 3.75, 'max': 4.07, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '44_ConvTranspose2d_Multiply_GlobalAvgPool_GlobalAvgPool_Mean.py': {'mean': 2.74, 'std': 0.00333, 'min': 2.74, 'max': 2.75, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '45_Gemm_Sigmoid_LogSumExp.py': {'mean': 12.9, 'std': 0.563, 'min': 11.6, 'max': 13.7, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '46_Conv2d_Subtract_Tanh_Subtract_AvgPool.py': {'mean': 9.37, 'std': 0.112, 'min': 9.23, 'max': 9.67, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '47_Conv3d_Mish_Tanh.py': {'mean': 3.98, 'std': 0.0728, 'min': 3.75, 'max': 4.06, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '48_Conv3d_Scaling_Tanh_Multiply_Sigmoid.py': {'mean': 5.9, 'std': 0.0124, 'min': 5.89, 'max': 5.93, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '49_ConvTranspose3d_Softmax_Sigmoid.py': {'mean': 4.82, 'std': 0.0191, 'min': 4.77, 'max': 4.85, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '50_ConvTranspose3d_Scaling_AvgPool_BiasAdd_Scaling.py': {'mean': 7.68, 'std': 0.0488, 'min': 7.63, 'max': 7.87, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '51_Gemm_Subtract_GlobalAvgPool_LogSumExp_GELU_ResidualAdd.py': {'mean': 8.41, 'std': 0.234, 'min': 7.86, 'max': 9.15, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '52_Conv2d_Activation_BatchNorm.py': {'mean': 5.47, 'std': 0.019, 'min': 5.38, 'max': 5.51, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '53_Gemm_Scaling_Hardtanh_GELU.py': {'mean': 8.33, 'std': 0.19, 'min': 7.84, 'max': 9.0, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '54_Conv2d_Multiply_LeakyReLU_GELU.py': {'mean': 9.24, 'std': 0.0943, 'min': 9.13, 'max': 9.52, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 
'cuda:0'}, '55_Matmul_MaxPool_Sum_Scale.py': {'mean': 10.9, 'std': 0.479, 'min': 9.52, 'max': 11.7, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '56_Matmul_Sigmoid_Sum.py': {'mean': 10.9, 'std': 0.423, 'min': 9.79, 'max': 11.5, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '57_Conv2d_ReLU_HardSwish.py': {'mean': 4.59, 'std': 0.00469, 'min': 4.58, 'max': 4.61, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '58_ConvTranspose3d_LogSumExp_HardSwish_Subtract_Clamp.py': {'mean': 9.81, 'std': 0.0879, 'min': 9.68, 'max': 9.97, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '59_Matmul_Swish_Scaling.py': {'mean': 10.9, 'std': 0.451, 'min': 9.25, 'max': 11.5, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '60_ConvTranspose3d_Swish_GroupNorm_HardSwish.py': {'mean': 11.3, 'std': 0.0816, 'min': 11.3, 'max': 11.5, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '61_ConvTranspose3d_ReLU_GroupNorm.py': {'mean': 3.0, 'std': 0.0134, 'min': 2.96, 'max': 3.01, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '62_Matmul_GroupNorm_LeakyReLU_Sum.py': {'mean': 1.29, 'std': 0.00824, 'min': 1.28, 'max': 1.32, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '63_Gemm_ReLU_Divide.py': {'mean': 4.89, 'std': 0.0606, 'min': 4.71, 'max': 4.98, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '64_Gemm_LogSumExp_LeakyReLU_LeakyReLU_GELU_GELU.py': {'mean': 4.96, 'std': 0.0516, 'min': 4.81, 'max': 5.04, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '65_Conv2d_AvgPool_Sigmoid_Sum.py': {'mean': 18.3, 'std': 0.495, 'min': 17.6, 'max': 19.1, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '66_Matmul_Dropout_Softmax.py': {'mean': 2.18, 'std': 0.0338, 'min': 2.11, 'max': 2.23, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 
'cuda:0'}, '67_Conv2d_GELU_GlobalAvgPool.py': {'mean': 9.49, 'std': 0.19, 'min': 9.31, 'max': 9.99, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '68_Matmul_Min_Subtract.py': {'mean': 2.17, 'std': 0.0261, 'min': 2.12, 'max': 2.2, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '69_Conv2d_HardSwish_ReLU.py': {'mean': 2.64, 'std': 0.00359, 'min': 2.63, 'max': 2.65, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '70_Gemm_Sigmoid_Scaling_ResidualAdd.py': {'mean': 4.94, 'std': 0.0579, 'min': 4.77, 'max': 5.02, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '71_Conv2d_Divide_LeakyReLU.py': {'mean': 2.64, 'std': 0.00421, 'min': 2.63, 'max': 2.65, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '72_ConvTranspose3d_BatchNorm_AvgPool_AvgPool.py': {'mean': 29.2, 'std': 0.0732, 'min': 29.1, 'max': 29.6, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '73_Conv2d_BatchNorm_Scaling.py': {'mean': 3.51, 'std': 0.00845, 'min': 3.5, 'max': 3.53, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '74_ConvTranspose3d_LeakyReLU_Multiply_LeakyReLU_Max.py': {'mean': 2.16, 'std': 0.00301, 'min': 2.16, 'max': 2.17, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '75_Gemm_GroupNorm_Min_BiasAdd.py': {'mean': 5.23, 'std': 0.0434, 'min': 5.1, 'max': 5.27, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '76_Gemm_Add_ReLU.py': {'mean': 5.19, 'std': 0.132, 'min': 4.61, 'max': 5.24, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '77_ConvTranspose3d_Scale_BatchNorm_GlobalAvgPool.py': {'mean': 5.72, 'std': 0.144, 'min': 5.37, 'max': 6.13, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '78_ConvTranspose3d_Max_Max_Sum.py': {'mean': 5.51, 'std': 0.0409, 'min': 5.4, 'max': 5.6, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 
'cuda:0'}, '79_Conv3d_Multiply_InstanceNorm_Clamp_Multiply_Max.py': {'mean': 1.64, 'std': 0.0152, 'min': 1.63, 'max': 1.7, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '80_Gemm_Max_Subtract_GELU.py': {'mean': 4.86, 'std': 0.0549, 'min': 4.69, 'max': 4.91, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '81_Gemm_Swish_Divide_Clamp_Tanh_Clamp.py': {'mean': 5.07, 'std': 0.0631, 'min': 4.88, 'max': 5.16, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '82_Conv2d_Tanh_Scaling_BiasAdd_Max.py': {'mean': 14.7, 'std': 0.266, 'min': 14.4, 'max': 15.1, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '83_Conv3d_GroupNorm_Min_Clamp_Dropout.py': {'mean': 6.39, 'std': 0.00941, 'min': 6.38, 'max': 6.42, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '84_Gemm_BatchNorm_Scaling_Softmax.py': {'mean': 4.98, 'std': 0.0634, 'min': 4.78, 'max': 5.05, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '85_Conv2d_GroupNorm_Scale_MaxPool_Clamp.py': {'mean': 3.62, 'std': 0.0682, 'min': 3.56, 'max': 3.73, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '86_Matmul_Divide_GELU.py': {'mean': 4.92, 'std': 0.0618, 'min': 4.73, 'max': 5.0, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '87_Conv2d_Subtract_Subtract_Mish.py': {'mean': 13.0, 'std': 0.217, 'min': 12.7, 'max': 13.4, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '88_Gemm_GroupNorm_Swish_Multiply_Swish.py': {'mean': 5.24, 'std': 0.0501, 'min': 5.08, 'max': 5.3, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '89_ConvTranspose3d_MaxPool_Softmax_Subtract_Swish_Max.py': {'mean': 7.71, 'std': 0.121, 'min': 7.55, 'max': 7.86, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '90_Conv3d_LeakyReLU_Sum_Clamp_GELU.py': {'mean': 16.8, 'std': 0.404, 'min': 16.3, 'max': 17.4, 'num_trials': 
100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '91_ConvTranspose2d_Softmax_BiasAdd_Scaling_Sigmoid.py': {'mean': 13.4, 'std': 0.0406, 'min': 13.3, 'max': 13.5, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '92_Conv2d_GroupNorm_Tanh_HardSwish_ResidualAdd_LogSumExp.py': {'mean': 6.36, 'std': 0.00427, 'min': 6.35, 'max': 6.38, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '93_ConvTranspose2d_Add_Min_GELU_Multiply.py': {'mean': 8.78, 'std': 0.0219, 'min': 8.72, 'max': 8.83, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '94_Gemm_BiasAdd_Hardtanh_Mish_GroupNorm.py': {'mean': 5.18, 'std': 0.0485, 'min': 5.0, 'max': 5.26, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '95_Matmul_Add_Swish_Tanh_GELU_Hardtanh.py': {'mean': 5.12, 'std': 0.068, 'min': 4.89, 'max': 5.18, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '96_ConvTranspose3d_Multiply_Max_GlobalAvgPool_Clamp.py': {'mean': 7.44, 'std': 0.13, 'min': 7.28, 'max': 7.59, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '97_Matmul_BatchNorm_BiasAdd_Divide_Swish.py': {'mean': 5.11, 'std': 0.0607, 'min': 4.92, 'max': 5.18, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '98_Matmul_AvgPool_GELU_Scale_Max.py': {'mean': 4.95, 'std': 0.0765, 'min': 4.72, 'max': 5.05, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '99_Matmul_GELU_Softmax.py': {'mean': 4.95, 'std': 0.0732, 'min': 4.73, 'max': 5.01, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '100_ConvTranspose3d_Clamp_Min_Divide.py': {'mean': 20.6, 'std': 0.0904, 'min': 20.5, 'max': 20.8, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}}, 'level3': {'1_MLP.py': {'mean': 5.52, 'std': 0.075, 'min': 5.25, 'max': 5.57, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '2_ShallowWideMLP.py': {'mean': 20.9, 
'std': 0.849, 'min': 19.5, 'max': 22.4, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '3_DeepNarrowMLP.py': {'mean': 2.79, 'std': 0.011, 'min': 2.77, 'max': 2.81, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '4_LeNet5.py': {'mean': 3.92, 'std': 0.00875, 'min': 3.91, 'max': 3.97, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '5_AlexNet.py': {'mean': 35.7, 'std': 0.609, 'min': 34.4, 'max': 37.2, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '6_GoogleNetInceptionModule.py': {'mean': 13.0, 'std': 0.283, 'min': 12.3, 'max': 13.6, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '7_GoogleNetInceptionV1.py': {'mean': 2.11, 'std': 0.042, 'min': 2.09, 'max': 2.52, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '8_ResNetBasicBlock.py': {'mean': 2.31, 'std': 0.006, 'min': 2.29, 'max': 2.33, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '9_ResNet18.py': {'mean': 1.38, 'std': 0.0764, 'min': 1.32, 'max': 1.91, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '10_ResNet101.py': {'mean': 6.71, 'std': 0.0127, 'min': 6.68, 'max': 6.79, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '11_VGG16.py': {'mean': 4.9, 'std': 0.0316, 'min': 4.82, 'max': 4.96, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '12_VGG19.py': {'mean': 5.59, 'std': 0.051, 'min': 5.45, 'max': 5.69, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '13_DenseNet121TransitionLayer.py': {'mean': 11.0, 'std': 0.0097, 'min': 10.9, 'max': 11.0, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '14_DenseNet121DenseBlock.py': {'mean': 10.5, 'std': 0.108, 'min': 10.4, 'max': 10.7, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '15_DenseNet121.py': {'mean': 5.32, 'std': 0.357, 'min': 5.09, 'max': 6.36, 
'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '16_DenseNet201.py': {'mean': 9.69, 'std': 0.0119, 'min': 9.65, 'max': 9.71, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '17_SqueezeNetFireModule.py': {'mean': 20.2, 'std': 0.247, 'min': 19.9, 'max': 20.5, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '18_SqueezeNet.py': {'mean': 39.1, 'std': 0.605, 'min': 38.2, 'max': 39.9, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '19_MobileNetV1.py': {'mean': 1.46, 'std': 0.0104, 'min': 1.45, 'max': 1.53, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '20_MobileNetV2.py': {'mean': 2.74, 'std': 0.102, 'min': 2.66, 'max': 3.38, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '21_EfficientNetMBConv.py': {'mean': 8.28, 'std': 0.106, 'min': 8.09, 'max': 8.68, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '22_EfficientNetB0.py': {'mean': 2.71, 'std': 0.203, 'min': 2.24, 'max': 3.04, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '23_EfficientNetB1.py': {'mean': 1.42, 'std': 0.007, 'min': 1.41, 'max': 1.47, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '24_EfficientNetB2.py': {'mean': 1.23, 'std': 0.0222, 'min': 1.2, 'max': 1.36, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '25_ShuffleNetUnit.py': {'mean': 14.6, 'std': 0.18, 'min': 14.4, 'max': 15.0, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '26_ShuffleNet.py': {'mean': 35.3, 'std': 0.0435, 'min': 35.2, 'max': 35.4, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '27_RegNet.py': {'mean': 3.24, 'std': 0.0162, 'min': 3.21, 'max': 3.27, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '28_VisionTransformer.py': {'mean': 2.75, 'std': 0.0636, 'min': 2.66, 'max': 3.08, 'num_trials': 100, 'hardware': 'NVIDIA H100 
PCIe', 'device': 'cuda:0'}, '29_SwinMLP.py': {'mean': 7.81, 'std': 0.892, 'min': 7.35, 'max': 12.8, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '30_SwinTransformerV2.py': {'mean': 11.5, 'std': 2.85, 'min': 9.86, 'max': 18.6, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '31_VisionAttention.py': {'mean': 37.2, 'std': 0.616, 'min': 36.0, 'max': 38.5, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '32_ConvolutionalVisionTransformer.py': {'mean': 1.58, 'std': 0.036, 'min': 1.53, 'max': 1.74, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '33_VanillaRNN.py': {'mean': 13.3, 'std': 0.51, 'min': 12.0, 'max': 13.9, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '34_VanillaRNNHidden.py': {'mean': 12.6, 'std': 0.672, 'min': 12.4, 'max': 18.8, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '35_LSTM.py': {'mean': 29.3, 'std': 0.773, 'min': 29.0, 'max': 36.9, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '36_LSTMHn.py': {'mean': 29.3, 'std': 0.523, 'min': 29.0, 'max': 32.6, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '37_LSTMCn.py': {'mean': 29.3, 'std': 0.539, 'min': 28.9, 'max': 32.9, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '38_LSTMBidirectional.py': {'mean': 57.2, 'std': 0.959, 'min': 56.7, 'max': 64.9, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '39_GRU.py': {'mean': 21.8, 'std': 0.247, 'min': 21.6, 'max': 24.1, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '40_GRUHidden.py': {'mean': 21.9, 'std': 0.825, 'min': 21.6, 'max': 27.6, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '41_GRUBidirectional.py': {'mean': 43.3, 'std': 0.934, 'min': 42.5, 'max': 48.5, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '42_GRUBidirectionalHidden.py': 
{'mean': 43.0, 'std': 0.423, 'min': 42.4, 'max': 45.7, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '43_MinGPTCausalAttention.py': {'mean': 19.9, 'std': 0.579, 'min': 19.1, 'max': 21.1, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '44_MiniGPTBlock.py': {'mean': 47.6, 'std': 1.06, 'min': 45.7, 'max': 49.4, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '45_UNetSoftmax.py': {'mean': 7.11, 'std': 0.0301, 'min': 7.04, 'max': 7.22, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '46_NetVladWithGhostClusters.py': {'mean': 2.18, 'std': 0.0536, 'min': 2.08, 'max': 2.25, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '47_NetVladNoGhostClusters.py': {'mean': 1.97, 'std': 0.0383, 'min': 1.89, 'max': 2.02, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '48_Mamba2ReturnY.py': {'mean': 12.3, 'std': 0.18, 'min': 12.1, 'max': 12.6, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '49_Mamba2ReturnFinalState.py': {'mean': 9.18, 'std': 0.116, 'min': 9.11, 'max': 9.47, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}, '50_ReLUSelfAttention.py': {'mean': 7.74, 'std': 0.0749, 'min': 7.51, 'max': 7.92, 'num_trials': 100, 'hardware': 'NVIDIA H100 PCIe', 'device': 'cuda:0'}}}\n"
     ]
    }
   ],
   "source": [
    "print(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "level1/97_ScaledDotProductAttention.py: 0.243\n",
      "level2/38_ConvTranspose3d_AvgPool_Clamp_Softmax_Multiply.py: 1.16\n",
      "level1/50_conv_standard_2D__square_input__square_kernel.py: 1.2\n",
      "level3/24_EfficientNetB2.py: 1.23\n",
      "level2/62_Matmul_GroupNorm_LeakyReLU_Sum.py: 1.29\n",
      "level2/27_Conv3d_HardSwish_GroupNorm_Mean.py: 1.37\n",
      "level3/9_ResNet18.py: 1.38\n",
      "level3/23_EfficientNetB1.py: 1.42\n",
      "level2/6_Conv3d_Softmax_MaxPool_MaxPool.py: 1.46\n",
      "level3/19_MobileNetV1.py: 1.46\n",
      "level1/10_3D_tensor_matrix_multiplication.py: 1.53\n",
      "level3/32_ConvolutionalVisionTransformer.py: 1.58\n",
      "level2/79_Conv3d_Multiply_InstanceNorm_Clamp_Multiply_Max.py: 1.64\n",
      "level1/95_CrossEntropyLoss.py: 1.67\n",
      "level2/24_Conv3d_Min_Softmax.py: 1.76\n",
      "level1/54_conv_standard_3D__square_input__square_kernel.py: 1.79\n",
      "level1/6_Matmul_with_large_K_dimension_.py: 1.83\n",
      "level1/83_conv_depthwise_2D_square_input_asymmetric_kernel.py: 1.93\n",
      "level3/47_NetVladNoGhostClusters.py: 1.97\n",
      "level3/7_GoogleNetInceptionV1.py: 2.11\n",
      "level2/74_ConvTranspose3d_LeakyReLU_Multiply_LeakyReLU_Max.py: 2.16\n",
      "level2/68_Matmul_Min_Subtract.py: 2.17\n",
      "level2/66_Matmul_Dropout_Softmax.py: 2.18\n",
      "level3/46_NetVladWithGhostClusters.py: 2.18\n",
      "level1/71_conv_transposed_2D__asymmetric_input__square_kernel.py: 2.22\n",
      "level2/36_ConvTranspose2d_Min_Sum_GELU_Add.py: 2.22\n",
      "level2/23_Conv3d_GroupNorm_Mean.py: 2.23\n",
      "level3/8_ResNetBasicBlock.py: 2.31\n",
      "level1/77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.py: 2.43\n",
      "level1/81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.py: 2.52\n",
      "level1/58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.py: 2.62\n",
      "level2/15_ConvTranspose3d_BatchNorm_Subtract.py: 2.62\n",
      "level1/88_MinGPTNewGelu.py: 2.64\n",
      "level2/69_Conv2d_HardSwish_ReLU.py: 2.64\n",
      "level2/71_Conv2d_Divide_LeakyReLU.py: 2.64\n",
      "level1/79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.py: 2.7\n",
      "level3/22_EfficientNetB0.py: 2.71\n",
      "level1/59_conv_standard_3D__asymmetric_input__square_kernel.py: 2.74\n",
      "level2/44_ConvTranspose2d_Multiply_GlobalAvgPool_GlobalAvgPool_Mean.py: 2.74\n",
      "level3/20_MobileNetV2.py: 2.74\n",
      "level3/28_VisionTransformer.py: 2.75\n",
      "level3/3_DeepNarrowMLP.py: 2.79\n",
      "level1/74_conv_transposed_1D_dilated.py: 2.9\n",
      "level2/61_ConvTranspose3d_ReLU_GroupNorm.py: 3.0\n",
      "level1/73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.py: 3.03\n",
      "level1/85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.py: 3.03\n",
      "level1/78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.py: 3.2\n",
      "level1/72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_.py: 3.22\n",
      "level3/27_RegNet.py: 3.24\n",
      "level1/82_conv_depthwise_2D_square_input_square_kernel.py: 3.36\n",
      "level1/66_conv_standard_3D__asymmetric_input__asymmetric_kernel.py: 3.46\n",
      "level1/16_Matmul_with_transposed_A.py: 3.5\n",
      "level2/73_Conv2d_BatchNorm_Scaling.py: 3.51\n",
      "level1/65_conv_transposed_2D__square_input__asymmetric_kernel.py: 3.58\n",
      "level1/18_Matmul_with_transposed_both.py: 3.62\n",
      "level2/85_Conv2d_GroupNorm_Scale_MaxPool_Clamp.py: 3.62\n",
      "level1/12_Matmul_with_diagonal_matrices_.py: 3.68\n",
      "level1/14_Matmul_for_upper_triangular_matrices.py: 3.7\n",
      "level1/15_Matmul_for_lower_triangular_matrices.py: 3.7\n",
      "level1/1_Square_matrix_multiplication_.py: 3.75\n",
      "level1/13_Matmul_for_symmetric_matrices.py: 3.78\n",
      "level2/11_ConvTranspose2d_BatchNorm_Tanh_MaxPool_GroupNorm.py: 3.82\n",
      "level3/4_LeNet5.py: 3.92\n",
      "level2/47_Conv3d_Mish_Tanh.py: 3.98\n",
      "level1/67_conv_standard_1D.py: 3.99\n",
      "level2/43_Conv3d_Max_LogSumExp_ReLU.py: 4.02\n",
      "level1/69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.py: 4.32\n",
      "level1/9_Tall_skinny_matrix_multiplication_.py: 4.35\n",
      "level1/4_Matrix_vector_multiplication_.py: 4.5\n",
      "level1/2_Standard_matrix_multiplication_.py: 4.56\n",
      "level2/57_Conv2d_ReLU_HardSwish.py: 4.59\n",
      "level1/47_Sum_reduction_over_a_dimension.py: 4.6\n",
      "level1/48_Mean_reduction_over_a_dimension.py: 4.6\n",
      "level1/5_Matrix_scalar_multiplication.py: 4.64\n",
      "level1/51_Argmax_over_a_dimension.py: 4.75\n",
      "level1/52_Argmin_over_a_dimension.py: 4.75\n",
      "level1/49_Max_reduction_over_a_dimension.py: 4.76\n",
      "level1/53_Min_reduction_over_a_dimension.py: 4.76\n",
      "level2/12_Gemm_Multiply_LeakyReLU.py: 4.82\n",
      "level2/49_ConvTranspose3d_Softmax_Sigmoid.py: 4.82\n",
      "level2/18_Matmul_Sum_Max_AvgPool_LogSumExp_LogSumExp.py: 4.85\n",
      "level2/29_Matmul_Mish_Mish.py: 4.85\n",
      "level2/9_Matmul_Subtract_Multiply_ReLU.py: 4.86\n",
      "level2/80_Gemm_Max_Subtract_GELU.py: 4.86\n",
      "level2/63_Gemm_ReLU_Divide.py: 4.89\n",
      "level3/11_VGG16.py: 4.9\n",
      "level2/33_Gemm_Scale_BatchNorm.py: 4.92\n",
      "level2/86_Matmul_Divide_GELU.py: 4.92\n",
      "level2/28_BMM_InstanceNorm_Sum_ResidualAdd_Multiply.py: 4.94\n",
      "level2/70_Gemm_Sigmoid_Scaling_ResidualAdd.py: 4.94\n",
      "level2/98_Matmul_AvgPool_GELU_Scale_Max.py: 4.95\n",
      "level2/99_Matmul_GELU_Softmax.py: 4.95\n",
      "level2/64_Gemm_LogSumExp_LeakyReLU_LeakyReLU_GELU_GELU.py: 4.96\n",
      "level2/30_Gemm_GroupNorm_Hardtanh.py: 4.98\n",
      "level2/84_Gemm_BatchNorm_Scaling_Softmax.py: 4.98\n",
      "level2/22_Matmul_Scale_ResidualAdd_Clamp_LogSumExp_Mish.py: 4.99\n",
      "level1/17_Matmul_with_transposed_B.py: 5.0\n",
      "level2/14_Gemm_Divide_Sum_Scaling.py: 5.05\n",
      "level2/81_Gemm_Swish_Divide_Clamp_Tanh_Clamp.py: 5.07\n",
      "level2/97_Matmul_BatchNorm_BiasAdd_Divide_Swish.py: 5.11\n",
      "level2/95_Matmul_Add_Swish_Tanh_GELU_Hardtanh.py: 5.12\n",
      "level1/86_conv_depthwise_separable_2D.py: 5.16\n",
      "level2/94_Gemm_BiasAdd_Hardtanh_Mish_GroupNorm.py: 5.18\n",
      "level2/76_Gemm_Add_ReLU.py: 5.19\n",
      "level2/75_Gemm_GroupNorm_Min_BiasAdd.py: 5.23\n",
      "level1/56_conv_standard_2D__asymmetric_input__asymmetric_kernel.py: 5.24\n",
      "level2/88_Gemm_GroupNorm_Swish_Multiply_Swish.py: 5.24\n",
      "level1/43_Max_Pooling_3D.py: 5.27\n",
      "level3/15_DenseNet121.py: 5.32\n",
      "level2/52_Conv2d_Activation_BatchNorm.py: 5.47\n",
      "level2/78_ConvTranspose3d_Max_Max_Sum.py: 5.51\n",
      "level1/80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.py: 5.52\n",
      "level3/1_MLP.py: 5.52\n",
      "level1/62_conv_standard_2D__square_input__asymmetric_kernel.py: 5.58\n",
      "level3/12_VGG19.py: 5.59\n",
      "level2/20_ConvTranspose3d_Sum_ResidualAdd_Multiply_ResidualAdd.py: 5.61\n",
      "level2/77_ConvTranspose3d_Scale_BatchNorm_GlobalAvgPool.py: 5.72\n",
      "level2/48_Conv3d_Scaling_Tanh_Multiply_Sigmoid.py: 5.9\n",
      "level1/98_KLDivLoss.py: 6.29\n",
      "level1/7_Matmul_with_small_K_dimension_.py: 6.31\n",
      "level2/92_Conv2d_GroupNorm_Tanh_HardSwish_ResidualAdd_LogSumExp.py: 6.36\n",
      "level2/83_Conv3d_GroupNorm_Min_Clamp_Dropout.py: 6.39\n",
      "level1/55_conv_standard_2D__asymmetric_input__square_kernel.py: 6.44\n",
      "level1/61_conv_transposed_3D__square_input__square_kernel.py: 6.44\n",
      "level2/1_Conv2D_ReLU_BiasAdd.py: 6.61\n",
      "level3/10_ResNet101.py: 6.71\n",
      "level1/99_TripletMarginLoss.py: 6.85\n",
      "level1/19_ReLU.py: 6.95\n",
      "level1/20_LeakyReLU.py: 6.95\n",
      "level1/21_Sigmoid.py: 6.95\n",
      "level1/22_Tanh.py: 6.95\n",
      "level1/26_GELU_.py: 6.95\n",
      "level1/27_SELU_.py: 6.95\n",
      "level1/28_HardSigmoid.py: 6.95\n",
      "level1/31_ELU.py: 6.95\n",
      "level1/32_HardTanh.py: 6.95\n",
      "level1/29_Softplus.py: 6.97\n",
      "level3/45_UNetSoftmax.py: 7.11\n",
      "level1/60_conv_standard_3D__square_input__asymmetric_kernel.py: 7.17\n",
      "level1/75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.py: 7.29\n",
      "level1/89_cumsum.py: 7.33\n",
      "level1/90_cumprod.py: 7.33\n",
      "level2/96_ConvTranspose3d_Multiply_Max_GlobalAvgPool_Clamp.py: 7.44\n",
      "level2/17_Conv2d_InstanceNorm_Divide.py: 7.45\n",
      "level1/87_conv_pointwise_2D.py: 7.63\n",
      "level2/50_ConvTranspose3d_Scaling_AvgPool_BiasAdd_Scaling.py: 7.68\n",
      "level2/89_ConvTranspose3d_MaxPool_Softmax_Subtract_Swish_Max.py: 7.71\n",
      "level3/50_ReLUSelfAttention.py: 7.74\n",
      "level2/31_Conv2d_Min_Add_Multiply.py: 7.76\n",
      "level3/29_SwinMLP.py: 7.81\n",
      "level2/35_Conv2d_Subtract_HardSwish_MaxPool_Mish.py: 7.97\n",
      "level1/8_Matmul_with_irregular_shapes_.py: 7.98\n",
      "level1/64_conv_transposed_1D.py: 8.05\n",
      "level2/26_ConvTranspose3d_Add_HardSwish.py: 8.09\n",
      "level3/21_EfficientNetMBConv.py: 8.28\n",
      "level2/53_Gemm_Scaling_Hardtanh_GELU.py: 8.33\n",
      "level2/51_Gemm_Subtract_GlobalAvgPool_LogSumExp_GELU_ResidualAdd.py: 8.41\n",
      "level2/8_Conv3d_Divide_Max_GlobalAvgPool_BiasAdd_Sum.py: 8.51\n",
      "level2/93_ConvTranspose2d_Add_Min_GELU_Multiply.py: 8.78\n",
      "level1/40_LayerNorm.py: 8.94\n",
      "level1/96_HuberLoss.py: 9.0\n",
      "level3/49_Mamba2ReturnFinalState.py: 9.18\n",
      "level1/57_conv_transposed_2D__square_input__square_kernel.py: 9.2\n",
      "level2/54_Conv2d_Multiply_LeakyReLU_GELU.py: 9.24\n",
      "level2/46_Conv2d_Subtract_Tanh_Subtract_AvgPool.py: 9.37\n",
      "level2/67_Conv2d_GELU_GlobalAvgPool.py: 9.49\n",
      "level3/16_DenseNet201.py: 9.69\n",
      "level1/45_Average_Pooling_2D.py: 9.76\n",
      "level2/58_ConvTranspose3d_LogSumExp_HardSwish_Subtract_Clamp.py: 9.81\n",
      "level2/25_Conv2d_Min_Tanh_Tanh.py: 9.91\n",
      "level1/63_conv_standard_2D__square_input__square_kernel.py: 10.1\n",
      "level2/21_Conv2d_Add_Scale_Sigmoid_GroupNorm.py: 10.5\n",
      "level3/14_DenseNet121DenseBlock.py: 10.5\n",
      "level2/42_ConvTranspose2d_GlobalAvgPool_BiasAdd_LogSumExp_Sum_Multiply.py: 10.8\n",
      "level2/55_Matmul_MaxPool_Sum_Scale.py: 10.9\n",
      "level2/56_Matmul_Sigmoid_Sum.py: 10.9\n",
      "level2/59_Matmul_Swish_Scaling.py: 10.9\n",
      "level3/13_DenseNet121TransitionLayer.py: 11.0\n",
      "level2/60_ConvTranspose3d_Swish_GroupNorm_HardSwish.py: 11.3\n",
      "level1/46_Average_Pooling_3D.py: 11.4\n",
      "level3/30_SwinTransformerV2.py: 11.5\n",
      "level1/68_conv_transposed_3D__square_input__asymmetric_kernel.py: 12.1\n",
      "level1/70_conv_transposed_3D__asymmetric_input__square_kernel.py: 12.1\n",
      "level2/37_Matmul_Swish_Sum_GroupNorm.py: 12.2\n",
      "level3/48_Mamba2ReturnY.py: 12.3\n",
      "level3/34_VanillaRNNHidden.py: 12.6\n",
      "level1/37_FrobeniusNorm_.py: 12.8\n",
      "level1/92_cumsum_exclusive.py: 12.8\n",
      "level2/45_Gemm_Sigmoid_LogSumExp.py: 12.9\n",
      "level2/87_Conv2d_Subtract_Subtract_Mish.py: 13.0\n",
      "level3/6_GoogleNetInceptionModule.py: 13.0\n",
      "level1/93_masked_cumsum.py: 13.2\n",
      "level1/84_conv_depthwise_2D_asymmetric_input_square_kernel.py: 13.3\n",
      "level3/33_VanillaRNN.py: 13.3\n",
      "level2/91_ConvTranspose2d_Softmax_BiasAdd_Scaling_Sigmoid.py: 13.4\n",
      "level2/13_ConvTranspose3d_Mean_Add_Softmax_Tanh_Scaling.py: 13.5\n",
      "level1/94_MSELoss.py: 13.6\n",
      "level1/24_LogSoftmax.py: 13.8\n",
      "level1/35_GroupNorm_.py: 13.8\n",
      "level1/34_InstanceNorm.py: 13.9\n",
      "level1/23_Softmax.py: 14.3\n",
      "level2/10_ConvTranspose2d_MaxPool_Hardtanh_Mean_Tanh.py: 14.6\n",
      "level3/25_ShuffleNetUnit.py: 14.6\n",
      "level2/82_Conv2d_Tanh_Scaling_BiasAdd_Max.py: 14.7\n",
      "level1/39_L2Norm_.py: 14.8\n",
      "level1/100_HingeLoss.py: 16.6\n",
      "level2/90_Conv3d_LeakyReLU_Sum_Clamp_GELU.py: 16.8\n",
      "level1/25_Swish.py: 17.1\n",
      "level2/39_Gemm_Scale_BatchNorm.py: 17.1\n",
      "level2/40_Matmul_Scaling_ResidualAdd.py: 17.2\n",
      "level2/16_ConvTranspose2d_Mish_Add_Hardtanh_Scaling.py: 17.3\n",
      "level2/41_Gemm_BatchNorm_GELU_ReLU.py: 17.4\n",
      "level1/91_cumsum_reverse.py: 17.5\n",
      "level2/19_ConvTranspose2d_GELU_GroupNorm.py: 18.3\n",
      "level2/65_Conv2d_AvgPool_Sigmoid_Sum.py: 18.3\n",
      "level1/76_conv_standard_1D_dilated_strided__.py: 19.1\n",
      "level1/33_BatchNorm.py: 19.6\n",
      "level2/2_ConvTranspose2d_BiasAdd_Clamp_Scaling_Clamp_Divide.py: 19.9\n",
      "level3/43_MinGPTCausalAttention.py: 19.9\n",
      "level3/17_SqueezeNetFireModule.py: 20.2\n",
      "level2/100_ConvTranspose3d_Clamp_Min_Divide.py: 20.6\n",
      "level3/2_ShallowWideMLP.py: 20.9\n",
      "level1/44_Average_Pooling_1D.py: 21.2\n",
      "level1/36_RMSNorm_.py: 21.7\n",
      "level3/39_GRU.py: 21.8\n",
      "level3/40_GRUHidden.py: 21.9\n",
      "level1/38_L1Norm_.py: 24.0\n",
      "level1/30_Softsign.py: 24.1\n",
      "level2/32_Conv2d_Scaling_Min.py: 24.9\n",
      "level1/11_4D_tensor_matrix_multiplication.py: 26.4\n",
      "level2/4_Conv2d_Mish_Mish.py: 27.3\n",
      "level1/41_Max_Pooling_1D.py: 27.9\n",
      "level1/42_Max_Pooling_2D.py: 28.0\n",
      "level2/7_Conv3d_ReLU_LeakyReLU_GELU_Sigmoid_BiasAdd.py: 28.0\n",
      "level2/72_ConvTranspose3d_BatchNorm_AvgPool_AvgPool.py: 29.2\n",
      "level3/35_LSTM.py: 29.3\n",
      "level3/36_LSTMHn.py: 29.3\n",
      "level3/37_LSTMCn.py: 29.3\n",
      "level3/26_ShuffleNet.py: 35.3\n",
      "level3/5_AlexNet.py: 35.7\n",
      "level3/31_VisionAttention.py: 37.2\n",
      "level3/18_SqueezeNet.py: 39.1\n",
      "level3/42_GRUBidirectionalHidden.py: 43.0\n",
      "level3/41_GRUBidirectional.py: 43.3\n",
      "level3/44_MiniGPTBlock.py: 47.6\n",
      "level2/5_ConvTranspose2d_Subtract_Tanh.py: 55.8\n",
      "level3/38_LSTMBidirectional.py: 57.2\n",
      "level2/3_ConvTranspose3d_Sum_LayerNorm_AvgPool_GELU.py: 62.4\n",
      "level2/34_ConvTranspose3d_LayerNorm_GELU_Scaling.py: 62.9\n",
      "level1/3_Batched_matrix_multiplication.py: 67.7\n"
     ]
    }
   ],
   "source": [
    "times = []\n",
    "for level in [1, 2, 3]:\n",
    "    levelx_data = data[f\"level{level}\"]\n",
    "    for problem_name, problem_data in levelx_data.items():\n",
    "        times.append((problem_data[\"mean\"], f\"level{level}/{problem_name}\"))\n",
    "\n",
    "times.sort(key=lambda x: x[0])\n",
    "\n",
    "for time, problem_name in times:\n",
    "    print(f\"{problem_name}: {time}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "kernel-bench",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
